import pickle
import helpsk as hlp
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder # , LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# from sklearn.base import BaseEstimator, TransformerMixin
%matplotlib inline
# Load the saved summary of a previously-run BayesSearchCV hyperparameter
# search (XGBoost) from its YAML file.
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = 'Run 1 - XGBoost - BayesSearchCV.yaml')
# Best cross-validation score and the hyperparameters that achieved it.
print(results.best_score)
results.best_params
0.7705175280760079
{'model': 'XGBClassifier()',
'max_depth': 8,
'learning_rate': 0.0036325173225203837,
'n_estimators': 1341,
'min_child_weight': 2,
'subsample': 0.7998768156287402,
'colsample_bytree': 0.8892648282704134,
'colsample_bylevel': 0.5213921375991398,
'reg_alpha': 0.39963322595869505,
'reg_lambda': 1.8262863809878243,
'imputer': "SimpleImputer(strategy='median')",
'scaler': 'None',
'encoder': 'OneHotEncoder()'}
def _load_pickle(path):
    """Load and return a single pickled object from `path`.

    NOTE(review): pickle is only safe on trusted files -- these are our own
    saved train/test splits.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load the train/test splits saved by an earlier notebook.
X_train = _load_pickle('../X_train.pkl')
print(X_train.shape)
y_train = _load_pickle('../y_train.pkl')
print(len(y_train))
X_test = _load_pickle('../X_test.pkl')
print(X_test.shape)
y_test = _load_pickle('../y_test.pkl')
print(len(y_test))
(800, 20) 800 (200, 20) 200
# Peek at the first few rows of the training features.
X_train.head()
| checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | other_parties | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 29 | NaN | NaN | delayed previously | business | 0.0 | <100 | >=7 | 3.0 | male single | none | 4.0 | no known property | 63.0 | none | own | 2.0 | skilled | 1.0 | yes | yes |
| 535 | >=200 | 21.0 | critical/other existing credit | education | 2319.0 | <100 | <1 | 2.0 | male div/sep | none | 1.0 | car | 33.0 | none | rent | 1.0 | skilled | 1.0 | none | yes |
| 695 | no checking | 6.0 | existing paid | used car | 1236.0 | 500<=X<1000 | 1<=X<4 | 2.0 | male single | none | 4.0 | life insurance | 50.0 | none | rent | 1.0 | skilled | 1.0 | none | yes |
| 557 | no checking | 21.0 | no credits/all paid | new car | 5003.0 | no known savings | 1<=X<4 | 1.0 | female div/dep/mar | none | 4.0 | life insurance | 29.0 | bank | own | 2.0 | skilled | 1.0 | yes | yes |
| 836 | no checking | 12.0 | existing paid | radio/tv | 886.0 | no known savings | 1<=X<4 | 4.0 | female div/dep/mar | none | 2.0 | car | 21.0 | none | own | 1.0 | skilled | 1.0 | none | yes |
# Spot-check the first ten training labels (binary 0/1).
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Peek at the first few rows of the test features.
X_test.head()
| checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | other_parties | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 521 | <0 | 18.0 | existing paid | radio/tv | 3190.0 | <100 | 1<=X<4 | 2.0 | female div/dep/mar | none | 2.0 | real estate | 24.0 | none | own | 1.0 | skilled | 1.0 | none | yes |
| 737 | <0 | 18.0 | existing paid | new car | 4380.0 | 100<=X<500 | 1<=X<4 | 3.0 | male single | none | 4.0 | car | 35.0 | none | own | 1.0 | unskilled resident | 2.0 | yes | yes |
| 740 | <0 | 24.0 | all paid | new car | 2325.0 | 100<=X<500 | 4<=X<7 | 2.0 | male single | none | 3.0 | car | 32.0 | bank | own | 1.0 | skilled | 1.0 | none | yes |
| 660 | >=200 | 12.0 | existing paid | radio/tv | 1297.0 | <100 | 1<=X<4 | 3.0 | male mar/wid | none | 4.0 | real estate | 23.0 | none | rent | 1.0 | skilled | 1.0 | none | yes |
| 411 | no checking | 33.0 | critical/other existing credit | used car | 7253.0 | <100 | 4<=X<7 | 3.0 | male single | none | 2.0 | car | 35.0 | none | own | 2.0 | high qualif/self emp/mgmt | 1.0 | yes | yes |
# Spot-check the first ten test labels.
y_test[0:10]
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# NOTE(review): duplicate of an earlier cell -- repeated look at the training labels.
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
# Split the feature names by dtype so the ColumnTransformer below can apply
# imputation to the numeric columns and one-hot encoding to the rest.
numeric_columns = hlp.pandas.get_numeric_columns(X_train)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(X_train)
print(numeric_columns)
print(non_numeric_columns)
['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents'] ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']
# Re-display the best hyperparameters found by the search, for reference.
results.best_params
{'model': 'XGBClassifier()',
'max_depth': 8,
'learning_rate': 0.0036325173225203837,
'n_estimators': 1341,
'min_child_weight': 2,
'subsample': 0.7998768156287402,
'colsample_bytree': 0.8892648282704134,
'colsample_bylevel': 0.5213921375991398,
'reg_alpha': 0.39963322595869505,
'reg_lambda': 1.8262863809878243,
'imputer': "SimpleImputer(strategy='median')",
'scaler': 'None',
'encoder': 'OneHotEncoder()'}
# Keep only the XGBoost hyperparameters: drop the pipeline-level entries
# (model/imputer/scaler/encoder), which are rebuilt explicitly below.
_pipeline_keys = ('model', 'imputer', 'scaler', 'encoder')
model_params = {
    key: value
    for key, value in results.best_params.items()
    if key not in _pipeline_keys
}
model_params
{'max_depth': 8,
'learning_rate': 0.0036325173225203837,
'n_estimators': 1341,
'min_child_weight': 2,
'subsample': 0.7998768156287402,
'colsample_bytree': 0.8892648282704134,
'colsample_bylevel': 0.5213921375991398,
'reg_alpha': 0.39963322595869505,
'reg_lambda': 1.8262863809878243}
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
# Rebuild the winning pipeline from the search: median imputation for the
# numeric columns (no scaling -- the search selected scaler=None), one-hot
# encoding for the non-numeric columns, feeding an XGBClassifier configured
# with the best hyperparameters (model_params).
full_pipeline = make_pipeline(
    ColumnTransformer([
        (
            'numeric',
            make_pipeline(
                SimpleImputer(strategy='median'),
                #StandardScaler()
            ),
            numeric_columns
        ),
        (
            'non_numeric',
            make_pipeline(OneHotEncoder()),
            non_numeric_columns
        )
    ]),
    XGBClassifier(
        random_state=42,
        # NOTE(review): use_label_encoder was deprecated and later removed in
        # newer xgboost releases -- confirm the installed version accepts it.
        use_label_encoder=False,
        eval_metric='logloss',
        **model_params,
    )
)
# Show the levels of pipelines/transformers/model
#full_pipeline.named_steps
# Fit preprocessing + model on the training data (fit returns the same,
# now-fitted, pipeline object).
fitted_pipeline = full_pipeline.fit(X_train, y_train)
# fitted_pipeline.predict(X_test)
# fitted_pipeline.predict_proba(X_test)
# y_test
Understand the nature and degree of model overfitting by comparing the training-set evaluation below against the test-set evaluation.
# Evaluate on the *training* set so the degree of overfitting can be judged
# against the test-set evaluation further down.
train_scores = fitted_pipeline.predict_proba(X_train)[:, 1]
training_evaluator = hlp.sklearn_eval.TwoClassEvaluator(
    actual_values=y_train,
    predicted_scores=train_scores,
    score_threshold=0.5,
)
training_evaluator.plot_predicted_scores_histogram()
training_evaluator.plot_actual_vs_predict_histogram()
# Predicted probability of the positive class for each test observation.
predicted_scores = fitted_pipeline.predict_proba(X_test)[:, 1]
# save the predictions so that we can compare across models
with open('test_set_predictions.pkl', 'wb') as handle:
    pickle.dump(predicted_scores, handle)
# Test-set evaluation at the default 0.5 threshold; the positive class is
# "Defaulted" (credit default).
evaluator = hlp.sklearn_eval.TwoClassEvaluator(
    actual_values=y_test,
    predicted_scores=predicted_scores,
    positive_class='Defaulted',
    negative_class='Not Defaulted',
    score_threshold=0.5
)
# Drop the local reference; it is no longer needed in this scope.
del predicted_scores
evaluator.plot_predicted_scores_histogram()
evaluator.plot_actual_vs_predict_histogram()
evaluator.plot_confusion_matrix()
# Full metrics table, with two dummy-classifier baselines for comparison.
evaluator.all_metrics_df(return_style=True,
                         dummy_classifier_strategy=['prior', 'constant'],
                         round_by=3)
| Score | Dummy (prior) | Dummy (constant) | Explanation | |
|---|---|---|---|---|
| AUC | 0.806 | 0.500 | 0.500 | Area under the ROC curve (true pos. rate vs false pos. rate); ranges from 0.5 (purely random classifier) to 1.0 (perfect classifier) |
| True Positive Rate | 0.508 | 0.000 | 1.000 | 50.8% of positive instances were correctly identified.; i.e. 30 "Defaulted" labels were correctly identified out of 59 instances; a.k.a Sensitivity/Recall |
| True Negative Rate | 0.936 | 1.000 | 0.000 | 93.6% of negative instances were correctly identified.; i.e. 132 "Not Defaulted" labels were correctly identified out of 141 instances |
| False Positive Rate | 0.064 | 0.000 | 1.000 | 6.4% of negative instances were incorrectly identified as positive; i.e. 9 "Not Defaulted" labels were incorrectly identified as "Defaulted", out of 141 instances |
| False Negative Rate | 0.492 | 1.000 | 0.000 | 49.2% of positive instances were incorrectly identified as negative; i.e. 29 "Defaulted" labels were incorrectly identified as "Not Defaulted", out of 59 instances |
| Positive Predictive Value | 0.769 | 0.000 | 0.295 | When the model claims an instance is positive, it is correct 76.9% of the time; i.e. out of the 39 times the model predicted "Defaulted", it was correct 30 times; a.k.a precision |
| Negative Predictive Value | 0.820 | 0.705 | 0.000 | When the model claims an instance is negative, it is correct 82.0% of the time; i.e. out of the 161 times the model predicted "Not Defaulted", it was correct 132 times |
| F1 Score | 0.612 | 0.000 | 0.456 | The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. |
| Accuracy | 0.810 | 0.705 | 0.295 | 81.0% of instances were correctly identified |
| Error Rate | 0.190 | 0.295 | 0.705 | 19.0% of instances were incorrectly identified |
| % Positive | 0.295 | 0.295 | 0.295 | 29.5% of the data are positive; i.e. out of 200 total observations; 59 are labeled as "Defaulted" |
| Total Observations | 200 | 200 | 200 | There are 200 total observations; i.e. sample size |
# ROC curve (interactive plotly version).
fig = evaluator.plot_auc_curve(return_plotly=True)
fig.show()
<Figure size 720x444.984 with 0 Axes>
# Metric curves as a function of the score threshold, over the 0.1-0.6 range.
fig = evaluator.plot_threshold_curves(score_threshold_range=(0.1, 0.6),
                                      return_plotly=True)
fig.show()
<Figure size 720x444.984 with 0 Axes>
# Precision/recall trade-off across score thresholds 0.1-0.6.
fig = evaluator.plot_precision_recall_tradeoff(score_threshold_range=(0.1, 0.6),
                                               return_plotly=True)
fig.show()
<Figure size 720x444.984 with 0 Axes>
# Cumulative gain and lift by score percentile.
evaluator.calculate_lift_gain(return_style=True)
| Gain | Lift | |
|---|---|---|
| Percentile | ||
| 5 | 0.15 | 3.05 |
| 10 | 0.24 | 2.37 |
| 15 | 0.37 | 2.49 |
| 20 | 0.51 | 2.54 |
| 25 | 0.56 | 2.24 |
| 30 | 0.59 | 1.98 |
| 35 | 0.68 | 1.94 |
| 40 | 0.73 | 1.82 |
| 45 | 0.76 | 1.69 |
| 50 | 0.80 | 1.59 |
| 55 | 0.86 | 1.57 |
| 60 | 0.90 | 1.50 |
| 65 | 0.90 | 1.38 |
| 70 | 0.90 | 1.28 |
| 75 | 0.90 | 1.20 |
| 80 | 0.93 | 1.17 |
| 85 | 1.00 | 1.18 |
| 90 | 1.00 | 1.11 |
| 95 | 1.00 | 1.05 |
| 100 | 1.00 | 1.00 |
# Same lift/gain table, with the underlying observation/event counts included.
evaluator.calculate_lift_gain(return_style=True, include_all_info=True)
| # of Obs. | # of Pos. Events | Cumul. Pos. Events | Gain | Lift | |
|---|---|---|---|---|---|
| Percentile | |||||
| 5 | 10 | 9 | 9 | 0.15 | 3.05 |
| 10 | 10 | 5 | 14 | 0.24 | 2.37 |
| 15 | 10 | 8 | 22 | 0.37 | 2.49 |
| 20 | 10 | 8 | 30 | 0.51 | 2.54 |
| 25 | 10 | 3 | 33 | 0.56 | 2.24 |
| 30 | 10 | 2 | 35 | 0.59 | 1.98 |
| 35 | 10 | 5 | 40 | 0.68 | 1.94 |
| 40 | 10 | 3 | 43 | 0.73 | 1.82 |
| 45 | 10 | 2 | 45 | 0.76 | 1.69 |
| 50 | 10 | 2 | 47 | 0.80 | 1.59 |
| 55 | 10 | 4 | 51 | 0.86 | 1.57 |
| 60 | 10 | 2 | 53 | 0.90 | 1.50 |
| 65 | 10 | 0 | 53 | 0.90 | 1.38 |
| 70 | 10 | 0 | 53 | 0.90 | 1.28 |
| 75 | 10 | 0 | 53 | 0.90 | 1.20 |
| 80 | 10 | 2 | 55 | 0.93 | 1.17 |
| 85 | 10 | 4 | 59 | 1.00 | 1.18 |
| 90 | 10 | 0 | 59 | 1.00 | 1.11 |
| 95 | 10 | 0 | 59 | 1.00 | 1.05 |
| 100 | 10 | 0 | 59 | 1.00 | 1.00 |
from sklearn.inspection import permutation_importance
import time

# Permutation importance on the training set: shuffle one feature at a time
# and measure the drop in pipeline score (10 repeats per feature).
estimator = full_pipeline
# FIX: perf_counter() is the monotonic clock intended for elapsed-time
# measurement; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start_time = time.perf_counter()
result = permutation_importance(
    estimator, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
# NOTE(review): name kept for the plotting cell below, even though the model
# here is XGBoost rather than a random forest.
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 8.224 seconds
import matplotlib.pyplot as plt  # re-imported so this cell can run standalone

# Bar chart of mean permutation importance per feature, with std-dev error bars.
fig, ax = plt.subplots()
# FIX: forest_importances was sorted descending, but result.importances_std is
# still in the original column order -- passing it directly misaligns the error
# bars with the sorted bars. Align the std values to the sorted feature order.
importances_std = pd.Series(result.importances_std, index=feature_names)
forest_importances.plot.bar(yerr=importances_std[forest_importances.index], ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()